library(nnet)
library(rattle)
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.3
library(MASS)
library(stargazer)
## Warning: package 'stargazer' was built under R version 3.4.4
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(DT)
## Warning: package 'DT' was built under R version 3.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
# Load the Prudential Life Insurance Kaggle data.  NOTE(review): absolute,
# machine-specific paths — anyone else running this must edit them (or switch
# to a relative path / file.choose()).
train<-read.csv("~/Desktop/Programming for analytics/week 6/Prudential/train.csv")
test<-read.csv("~/Desktop/Programming for analytics/week 6/Prudential/test.csv")
head(train)
## Id Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4
## 1 2 1 D3 10 0.07692308
## 2 5 1 A1 26 0.07692308
## 3 6 1 E1 26 0.07692308
## 4 7 1 D4 10 0.48717949
## 5 8 1 D2 26 0.23076923
## 6 10 1 D2 26 0.23076923
## Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht
## 1 2 1 1 0.64179104 0.5818182
## 2 2 3 1 0.05970149 0.6000000
## 3 2 3 1 0.02985075 0.7454545
## 4 2 3 1 0.16417910 0.6727273
## 5 2 3 1 0.41791045 0.6545455
## 6 3 1 1 0.50746269 0.8363636
## Wt BMI Employment_Info_1 Employment_Info_2
## 1 0.1485356 0.3230080 0.028 12
## 2 0.1317992 0.2722877 0.000 1
## 3 0.2887029 0.4287804 0.030 9
## 4 0.2050209 0.3524377 0.042 9
## 5 0.2343096 0.4240456 0.027 9
## 6 0.2991632 0.3648867 0.325 15
## Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6
## 1 1 0 3 NA
## 2 3 0 2 0.0018
## 3 1 0 2 0.0300
## 4 1 0 3 0.2000
## 5 1 0 2 0.0500
## 6 1 0 2 1.0000
## InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5
## 1 1 2 6 3 1
## 2 1 2 6 3 1
## 3 1 2 8 3 1
## 4 2 2 8 3 1
## 5 1 2 6 3 1
## 6 1 2 8 3 1
## InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 1 2 1 1 1
## 2 2 1 2 1
## 3 1 1 2 1
## 4 2 1 2 1
## 5 2 1 2 1
## 6 1 1 2 1
## Insurance_History_3 Insurance_History_4 Insurance_History_5
## 1 3 1 0.000666667
## 2 3 1 0.000133333
## 3 1 3 NA
## 4 1 3 NA
## 5 1 3 NA
## 6 3 2 0.005000000
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 1 1 1 2
## 2 1 3 2
## 3 3 2 3
## 4 3 2 3
## 5 3 2 3
## 6 1 3 2
## Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 Family_Hist_5
## 1 2 NA 0.5980392 NA 0.5267857
## 2 2 0.1884058 NA 0.08450704 NA
## 3 3 0.3043478 NA 0.22535211 NA
## 4 3 0.4202899 NA 0.35211268 NA
## 5 2 0.4637681 NA 0.40845070 NA
## 6 2 NA 0.2941176 0.50704225 NA
## Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4
## 1 4 112 2 1
## 2 5 412 2 1
## 3 10 3 2 2
## 4 0 350 2 2
## 5 NA 162 2 2
## 6 6 491 2 2
## Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8
## 1 1 3 2 2
## 2 1 3 2 2
## 3 1 3 2 2
## 4 1 3 2 2
## 5 1 3 2 2
## 6 1 3 2 2
## Medical_History_9 Medical_History_10 Medical_History_11
## 1 1 NA 3
## 2 1 NA 3
## 3 2 NA 3
## 4 2 NA 3
## 5 2 NA 3
## 6 2 NA 3
## Medical_History_12 Medical_History_13 Medical_History_14
## 1 2 3 3
## 2 2 3 3
## 3 2 3 3
## 4 2 3 3
## 5 2 3 3
## 6 2 3 3
## Medical_History_15 Medical_History_16 Medical_History_17
## 1 240 3 3
## 2 0 1 3
## 3 NA 1 3
## 4 NA 1 3
## 5 NA 1 3
## 6 NA 1 3
## Medical_History_18 Medical_History_19 Medical_History_20
## 1 1 1 2
## 2 1 1 2
## 3 1 1 2
## 4 1 1 2
## 5 1 1 2
## 6 2 1 2
## Medical_History_21 Medical_History_22 Medical_History_23
## 1 1 2 3
## 2 1 2 3
## 3 1 2 3
## 4 2 2 3
## 5 1 2 3
## 6 2 2 3
## Medical_History_24 Medical_History_25 Medical_History_26
## 1 NA 1 3
## 2 NA 1 3
## 3 NA 2 2
## 4 NA 1 3
## 5 NA 2 2
## 6 NA 1 3
## Medical_History_27 Medical_History_28 Medical_History_29
## 1 3 1 3
## 2 3 1 3
## 3 3 1 3
## 4 3 1 3
## 5 3 1 3
## 6 3 1 3
## Medical_History_30 Medical_History_31 Medical_History_32
## 1 2 3 NA
## 2 2 3 NA
## 3 2 3 NA
## 4 2 3 NA
## 5 2 3 NA
## 6 2 3 NA
## Medical_History_33 Medical_History_34 Medical_History_35
## 1 1 3 1
## 2 3 1 1
## 3 3 3 1
## 4 3 3 1
## 5 3 3 1
## 6 3 1 1
## Medical_History_36 Medical_History_37 Medical_History_38
## 1 2 2 1
## 2 2 2 1
## 3 3 2 1
## 4 2 2 1
## 5 3 2 1
## 6 2 2 1
## Medical_History_39 Medical_History_40 Medical_History_41
## 1 3 3 3
## 2 3 3 1
## 3 3 3 1
## 4 3 3 1
## 5 3 3 1
## 6 3 3 3
## Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 1 0
## Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 1 0
## Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_48 Response
## 1 0 8
## 2 0 4
## 3 0 8
## 4 0 8
## 5 0 8
## 6 0 8
head(test)
## Id Product_Info_1 Product_Info_2 Product_Info_3 Product_Info_4
## 1 1 1 D3 26 0.48717949
## 2 3 1 A2 26 0.07692308
## 3 4 1 D3 26 0.14466667
## 4 9 1 A1 26 0.15170872
## 5 12 1 A1 26 0.07692308
## 6 13 1 D3 26 0.23076923
## Product_Info_5 Product_Info_6 Product_Info_7 Ins_Age Ht
## 1 2 3 1 0.6119403 0.7818182
## 2 2 3 1 0.6268657 0.7272727
## 3 2 3 1 0.5820896 0.7090909
## 4 2 1 1 0.5223881 0.6545455
## 5 2 3 1 0.2985075 0.6727273
## 6 2 3 1 0.5671642 0.8181818
## Wt BMI Employment_Info_1 Employment_Info_2
## 1 0.3389121 0.4722616 0.150 3
## 2 0.3117155 0.4849840 0.000 1
## 3 0.3200837 0.5191032 0.143 9
## 4 0.2677824 0.4869621 0.210 9
## 5 0.2468619 0.4287182 0.085 9
## 6 0.2991632 0.3797544 0.075 9
## Employment_Info_3 Employment_Info_4 Employment_Info_5 Employment_Info_6
## 1 1 0.00 2 0.50
## 2 3 0.07 2 0.20
## 3 1 0.00 2 0.45
## 4 1 0.00 2 1.00
## 5 1 0.00 2 0.20
## 6 1 0.00 2 0.40
## InsuredInfo_1 InsuredInfo_2 InsuredInfo_3 InsuredInfo_4 InsuredInfo_5
## 1 2 2 11 3 1
## 2 1 2 8 3 1
## 3 1 2 3 3 1
## 4 2 2 3 3 1
## 5 1 2 8 3 1
## 6 1 2 8 3 1
## InsuredInfo_6 InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 1 1 1 2 1
## 2 1 1 1 1
## 3 1 1 2 1
## 4 1 1 1 1
## 5 2 1 2 1
## 6 1 1 2 1
## Insurance_History_3 Insurance_History_4 Insurance_History_5
## 1 1 3 NA
## 2 3 1 0.001666667
## 3 1 3 NA
## 4 3 1 0.000666667
## 5 1 3 NA
## 6 1 3 NA
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 1 3 2 3
## 2 1 1 2
## 3 3 2 3
## 4 2 1 2
## 5 3 2 3
## 6 3 2 3
## Family_Hist_1 Family_Hist_2 Family_Hist_3 Family_Hist_4 Family_Hist_5
## 1 3 NA 0.6274510 0.7605634 NA
## 2 2 NA 0.5294118 0.7464789 NA
## 3 3 0.6666667 NA 0.6619718 NA
## 4 2 NA 0.6862745 0.6760563 NA
## 5 2 0.4492754 NA 0.3802817 NA
## 6 3 NA 0.6470588 NA 0.5535714
## Medical_History_1 Medical_History_2 Medical_History_3 Medical_History_4
## 1 2 16 2 2
## 2 5 261 3 1
## 3 3 132 2 1
## 4 NA 162 3 2
## 5 18 181 3 1
## 6 4 335 2 2
## Medical_History_5 Medical_History_6 Medical_History_7 Medical_History_8
## 1 1 3 1 2
## 2 1 3 2 2
## 3 1 3 2 2
## 4 1 1 2 3
## 5 1 3 2 2
## 6 1 3 2 2
## Medical_History_9 Medical_History_10 Medical_History_11
## 1 2 NA 3
## 2 1 NA 3
## 3 2 NA 3
## 4 2 NA 3
## 5 2 NA 3
## 6 2 NA 3
## Medical_History_12 Medical_History_13 Medical_History_14
## 1 2 1 3
## 2 2 3 3
## 3 2 3 3
## 4 2 3 3
## 5 2 3 3
## 6 2 3 3
## Medical_History_15 Medical_History_16 Medical_History_17
## 1 NA 1 2
## 2 110 3 3
## 3 240 1 3
## 4 NA 1 3
## 5 188 1 3
## 6 NA 1 3
## Medical_History_18 Medical_History_19 Medical_History_20
## 1 1 1 2
## 2 1 1 2
## 3 1 1 2
## 4 1 1 2
## 5 1 1 2
## 6 1 1 2
## Medical_History_21 Medical_History_22 Medical_History_23
## 1 1 2 1
## 2 1 2 3
## 3 1 2 3
## 4 2 2 3
## 5 1 2 1
## 6 1 2 3
## Medical_History_24 Medical_History_25 Medical_History_26
## 1 NA 2 2
## 2 NA 2 2
## 3 NA 2 2
## 4 NA 1 3
## 5 NA 1 3
## 6 NA 2 2
## Medical_History_27 Medical_History_28 Medical_History_29
## 1 1 1 3
## 2 3 1 3
## 3 3 1 1
## 4 3 2 3
## 5 3 1 1
## 6 3 1 3
## Medical_History_30 Medical_History_31 Medical_History_32
## 1 2 3 NA
## 2 2 3 NA
## 3 2 3 NA
## 4 2 3 NA
## 5 2 3 NA
## 6 2 3 NA
## Medical_History_33 Medical_History_34 Medical_History_35
## 1 3 3 1
## 2 3 3 1
## 3 1 3 1
## 4 3 1 1
## 5 3 3 1
## 6 3 3 1
## Medical_History_36 Medical_History_37 Medical_History_38
## 1 3 2 1
## 2 3 2 1
## 3 3 2 1
## 4 2 2 1
## 5 2 2 1
## 6 3 2 1
## Medical_History_39 Medical_History_40 Medical_History_41
## 1 3 3 3
## 2 3 3 1
## 3 3 3 3
## 4 3 3 3
## 5 3 3 3
## 6 3 3 1
## Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3 Medical_Keyword_4
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Medical_Keyword_5 Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 1 1 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 1 0 1 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 1 0 1 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 1 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 1
## 5 0 0 0
## 6 0 0 0
## Medical_Keyword_48
## 1 0
## 2 0
## 3 0
## 4 1
## 5 0
## 6 0
str (train)
## 'data.frame': 59381 obs. of 128 variables:
## $ Id : int 2 5 6 7 8 10 11 14 15 16 ...
## $ Product_Info_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_2 : Factor w/ 19 levels "A1","A2","A3",..: 17 1 19 18 16 16 8 16 17 19 ...
## $ Product_Info_3 : int 10 26 26 10 26 26 10 26 26 21 ...
## $ Product_Info_4 : num 0.0769 0.0769 0.0769 0.4872 0.2308 ...
## $ Product_Info_5 : int 2 2 2 2 2 3 2 2 2 2 ...
## $ Product_Info_6 : int 1 3 3 3 3 1 3 3 3 3 ...
## $ Product_Info_7 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Ins_Age : num 0.6418 0.0597 0.0299 0.1642 0.4179 ...
## $ Ht : num 0.582 0.6 0.745 0.673 0.655 ...
## $ Wt : num 0.149 0.132 0.289 0.205 0.234 ...
## $ BMI : num 0.323 0.272 0.429 0.352 0.424 ...
## $ Employment_Info_1 : num 0.028 0 0.03 0.042 0.027 0.325 0.11 0.12 0.165 0.025 ...
## $ Employment_Info_2 : int 12 1 9 9 9 15 1 12 9 1 ...
## $ Employment_Info_3 : int 1 3 1 1 1 1 3 1 1 3 ...
## $ Employment_Info_4 : num 0 0 0 0 0 0 NA 0 0 0 ...
## $ Employment_Info_5 : int 3 2 2 3 2 2 3 2 2 3 ...
## $ Employment_Info_6 : num NA 0.0018 0.03 0.2 0.05 1 0.8 1 1 0.05 ...
## $ InsuredInfo_1 : int 1 1 1 2 1 1 1 1 1 2 ...
## $ InsuredInfo_2 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ InsuredInfo_3 : int 6 6 8 8 6 8 3 6 3 3 ...
## $ InsuredInfo_4 : int 3 3 3 3 3 3 3 3 2 3 ...
## $ InsuredInfo_5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ InsuredInfo_6 : int 2 2 1 2 2 1 2 1 1 2 ...
## $ InsuredInfo_7 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Insurance_History_1: int 1 2 2 2 2 2 1 1 1 2 ...
## $ Insurance_History_2: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Insurance_History_3: int 3 3 1 1 1 3 3 3 3 3 ...
## $ Insurance_History_4: int 1 1 3 3 3 2 2 1 2 1 ...
## $ Insurance_History_5: num 0.000667 0.000133 NA NA NA ...
## $ Insurance_History_7: int 1 1 3 3 3 1 1 1 1 1 ...
## $ Insurance_History_8: int 1 3 2 2 2 3 1 1 1 3 ...
## $ Insurance_History_9: int 2 2 3 3 3 2 2 2 2 2 ...
## $ Family_Hist_1 : int 2 2 3 3 2 2 3 2 3 3 ...
## $ Family_Hist_2 : num NA 0.188 0.304 0.42 0.464 ...
## $ Family_Hist_3 : num 0.598 NA NA NA NA ...
## $ Family_Hist_4 : num NA 0.0845 0.2254 0.3521 0.4085 ...
## $ Family_Hist_5 : num 0.527 NA NA NA NA ...
## $ Medical_History_1 : int 4 5 10 0 NA 6 5 6 4 NA ...
## $ Medical_History_2 : int 112 412 3 350 162 491 600 145 16 162 ...
## $ Medical_History_3 : int 2 2 2 2 2 2 3 2 2 2 ...
## $ Medical_History_4 : int 1 1 2 2 2 2 2 2 2 2 ...
## $ Medical_History_5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_6 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_7 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_8 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_9 : int 1 1 2 2 2 2 1 1 1 2 ...
## $ Medical_History_10 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_11 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_12 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_13 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_14 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_15 : int 240 0 NA NA NA NA NA NA NA NA ...
## $ Medical_History_16 : int 3 1 1 1 1 1 1 1 1 3 ...
## $ Medical_History_17 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_18 : int 1 1 1 1 1 2 1 1 1 1 ...
## $ Medical_History_19 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_20 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_21 : int 1 1 1 2 1 2 1 1 1 1 ...
## $ Medical_History_22 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_23 : int 3 3 3 3 3 3 3 3 3 1 ...
## $ Medical_History_24 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_25 : int 1 1 2 1 2 1 1 1 1 1 ...
## $ Medical_History_26 : int 3 3 2 3 2 3 3 3 3 3 ...
## $ Medical_History_27 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_28 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_29 : int 3 3 3 3 3 3 1 3 1 3 ...
## $ Medical_History_30 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_31 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_32 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_33 : int 1 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_34 : int 3 1 3 3 3 1 3 3 3 3 ...
## $ Medical_History_35 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_36 : int 2 2 3 2 3 2 2 2 2 2 ...
## $ Medical_History_37 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_38 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_39 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_40 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_41 : int 3 1 1 1 1 3 3 1 3 1 ...
## $ Medical_Keyword_1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_9 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_10 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_13 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_14 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_15 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Medical_Keyword_16 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_18 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_19 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_20 : int 0 0 0 0 0 0 0 0 1 0 ...
## [list output truncated]
str(test)
## 'data.frame': 19765 obs. of 127 variables:
## $ Id : int 1 3 4 9 12 13 21 28 30 36 ...
## $ Product_Info_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_2 : Factor w/ 19 levels "A1","A2","A3",..: 17 2 17 1 1 17 3 18 17 3 ...
## $ Product_Info_3 : int 26 26 26 26 26 26 26 26 26 26 ...
## $ Product_Info_4 : num 0.4872 0.0769 0.1447 0.1517 0.0769 ...
## $ Product_Info_5 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Product_Info_6 : int 3 3 3 1 3 3 3 3 3 3 ...
## $ Product_Info_7 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Ins_Age : num 0.612 0.627 0.582 0.522 0.299 ...
## $ Ht : num 0.782 0.727 0.709 0.655 0.673 ...
## $ Wt : num 0.339 0.312 0.32 0.268 0.247 ...
## $ BMI : num 0.472 0.485 0.519 0.487 0.429 ...
## $ Employment_Info_1 : num 0.15 0 0.143 0.21 0.085 0.075 0.14 0.025 0.035 0.06 ...
## $ Employment_Info_2 : int 3 1 9 9 9 9 9 9 9 9 ...
## $ Employment_Info_3 : int 1 3 1 1 1 1 1 1 1 1 ...
## $ Employment_Info_4 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ Employment_Info_5 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Employment_Info_6 : num 0.5 0.2 0.45 1 0.2 0.4 1 0 NA 1 ...
## $ InsuredInfo_1 : int 2 1 1 2 1 1 2 1 2 1 ...
## $ InsuredInfo_2 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ InsuredInfo_3 : int 11 8 3 3 8 8 3 2 8 8 ...
## $ InsuredInfo_4 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ InsuredInfo_5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ InsuredInfo_6 : int 1 1 1 1 2 1 1 2 1 1 ...
## $ InsuredInfo_7 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Insurance_History_1: int 2 1 2 1 2 2 2 2 2 2 ...
## $ Insurance_History_2: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Insurance_History_3: int 1 3 1 3 1 1 3 1 1 1 ...
## $ Insurance_History_4: int 3 1 3 1 3 3 1 3 3 3 ...
## $ Insurance_History_5: num NA 0.001667 NA 0.000667 NA ...
## $ Insurance_History_7: int 3 1 3 2 3 3 1 3 3 3 ...
## $ Insurance_History_8: int 2 1 2 1 2 2 3 2 2 2 ...
## $ Insurance_History_9: int 3 2 3 2 3 3 2 3 3 3 ...
## $ Family_Hist_1 : int 3 2 3 2 2 3 2 2 3 3 ...
## $ Family_Hist_2 : num NA NA 0.667 NA 0.449 ...
## $ Family_Hist_3 : num 0.627 0.529 NA 0.686 NA ...
## $ Family_Hist_4 : num 0.761 0.746 0.662 0.676 0.38 ...
## $ Family_Hist_5 : num NA NA NA NA NA ...
## $ Medical_History_1 : int 2 5 3 NA 18 4 21 0 2 NA ...
## $ Medical_History_2 : int 16 261 132 162 181 335 112 491 112 162 ...
## $ Medical_History_3 : int 2 3 2 3 3 2 2 2 2 3 ...
## $ Medical_History_4 : int 2 1 1 2 1 2 1 2 2 2 ...
## $ Medical_History_5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_6 : int 3 3 3 1 3 3 3 3 3 3 ...
## $ Medical_History_7 : int 1 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_8 : int 2 2 2 3 2 2 2 2 2 2 ...
## $ Medical_History_9 : int 2 1 2 2 2 2 1 2 2 2 ...
## $ Medical_History_10 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_11 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_12 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_13 : int 1 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_14 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_15 : int NA 110 240 NA 188 NA 82 NA NA NA ...
## $ Medical_History_16 : int 1 3 1 1 1 1 1 1 1 1 ...
## $ Medical_History_17 : int 2 3 3 3 3 3 2 3 3 3 ...
## $ Medical_History_18 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_19 : int 1 1 1 1 1 1 1 2 1 1 ...
## $ Medical_History_20 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_21 : int 1 1 1 2 1 1 1 1 1 1 ...
## $ Medical_History_22 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_23 : int 1 3 3 3 1 3 3 3 1 3 ...
## $ Medical_History_24 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_25 : int 2 2 2 1 1 2 1 1 1 1 ...
## $ Medical_History_26 : int 2 2 2 3 3 2 3 3 3 3 ...
## $ Medical_History_27 : int 1 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_28 : int 1 1 1 2 1 1 1 1 1 1 ...
## $ Medical_History_29 : int 3 3 1 3 1 3 3 3 3 3 ...
## $ Medical_History_30 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_31 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_32 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_33 : int 3 3 1 3 3 3 1 3 3 3 ...
## $ Medical_History_34 : int 3 3 3 1 3 3 3 3 3 1 ...
## $ Medical_History_35 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_36 : int 3 3 3 2 2 3 2 2 2 2 ...
## $ Medical_History_37 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_38 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_39 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_40 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_41 : int 3 1 3 3 3 1 1 3 1 1 ...
## $ Medical_Keyword_1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_9 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_10 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_13 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_14 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_15 : int 1 0 0 0 0 0 0 0 1 0 ...
## $ Medical_Keyword_16 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_18 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_19 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_20 : int 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
# Name prefixes of the variable families in this dataset; used to group
# columns by type when summarizing them below.
var_kind<-c("Product_Info_", "Ins_Age", "Ht", "Wt","BMI","Employment_Info_","InsuredInfo_",
"Insurance_History_", "Family_Hist_","Medical_History_", "Medical_Keyword_")
Removing variables with excess NAs in both the test and train sets, using a function with a user-defined missingness threshold. As a preliminary step in data treatment, variables that have a high percentage of missing values are removed. While the threshold for removal is user-determined, for this exercise the threshold was set to 30%.
sapply(train, function(x) sum(is.na(x)) )
## Id Product_Info_1 Product_Info_2
## 0 0 0
## Product_Info_3 Product_Info_4 Product_Info_5
## 0 0 0
## Product_Info_6 Product_Info_7 Ins_Age
## 0 0 0
## Ht Wt BMI
## 0 0 0
## Employment_Info_1 Employment_Info_2 Employment_Info_3
## 19 0 0
## Employment_Info_4 Employment_Info_5 Employment_Info_6
## 6779 0 10854
## InsuredInfo_1 InsuredInfo_2 InsuredInfo_3
## 0 0 0
## InsuredInfo_4 InsuredInfo_5 InsuredInfo_6
## 0 0 0
## InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 0 0 0
## Insurance_History_3 Insurance_History_4 Insurance_History_5
## 0 0 25396
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 0 0 0
## Family_Hist_1 Family_Hist_2 Family_Hist_3
## 0 28656 34241
## Family_Hist_4 Family_Hist_5 Medical_History_1
## 19184 41811 8889
## Medical_History_2 Medical_History_3 Medical_History_4
## 0 0 0
## Medical_History_5 Medical_History_6 Medical_History_7
## 0 0 0
## Medical_History_8 Medical_History_9 Medical_History_10
## 0 0 58824
## Medical_History_11 Medical_History_12 Medical_History_13
## 0 0 0
## Medical_History_14 Medical_History_15 Medical_History_16
## 0 44596 0
## Medical_History_17 Medical_History_18 Medical_History_19
## 0 0 0
## Medical_History_20 Medical_History_21 Medical_History_22
## 0 0 0
## Medical_History_23 Medical_History_24 Medical_History_25
## 0 55580 0
## Medical_History_26 Medical_History_27 Medical_History_28
## 0 0 0
## Medical_History_29 Medical_History_30 Medical_History_31
## 0 0 0
## Medical_History_32 Medical_History_33 Medical_History_34
## 58274 0 0
## Medical_History_35 Medical_History_36 Medical_History_37
## 0 0 0
## Medical_History_38 Medical_History_39 Medical_History_40
## 0 0 0
## Medical_History_41 Medical_Keyword_1 Medical_Keyword_2
## 0 0 0
## Medical_Keyword_3 Medical_Keyword_4 Medical_Keyword_5
## 0 0 0
## Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 0 0 0
## Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 0 0 0
## Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 0 0 0
## Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 0 0 0
## Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 0 0 0
## Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 0 0 0
## Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 0 0 0
## Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 0 0 0
## Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 0 0 0
## Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 0 0 0
## Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 0 0 0
## Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 0 0 0
## Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 0 0 0
## Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 0 0 0
## Medical_Keyword_48 Response
## 0 0
sapply(test, function(x) sum(is.na(x)) )
## Id Product_Info_1 Product_Info_2
## 0 0 0
## Product_Info_3 Product_Info_4 Product_Info_5
## 0 0 0
## Product_Info_6 Product_Info_7 Ins_Age
## 0 0 0
## Ht Wt BMI
## 0 0 0
## Employment_Info_1 Employment_Info_2 Employment_Info_3
## 3 0 0
## Employment_Info_4 Employment_Info_5 Employment_Info_6
## 2137 0 3787
## InsuredInfo_1 InsuredInfo_2 InsuredInfo_3
## 0 0 0
## InsuredInfo_4 InsuredInfo_5 InsuredInfo_6
## 0 0 0
## InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 0 0 0
## Insurance_History_3 Insurance_History_4 Insurance_History_5
## 0 0 8105
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 0 0 0
## Family_Hist_1 Family_Hist_2 Family_Hist_3
## 0 9880 11064
## Family_Hist_4 Family_Hist_5 Medical_History_1
## 6677 13624 2972
## Medical_History_2 Medical_History_3 Medical_History_4
## 0 0 0
## Medical_History_5 Medical_History_6 Medical_History_7
## 0 0 0
## Medical_History_8 Medical_History_9 Medical_History_10
## 0 0 19564
## Medical_History_11 Medical_History_12 Medical_History_13
## 0 0 0
## Medical_History_14 Medical_History_15 Medical_History_16
## 0 14864 0
## Medical_History_17 Medical_History_18 Medical_History_19
## 0 0 0
## Medical_History_20 Medical_History_21 Medical_History_22
## 0 0 0
## Medical_History_23 Medical_History_24 Medical_History_25
## 0 18585 0
## Medical_History_26 Medical_History_27 Medical_History_28
## 0 0 0
## Medical_History_29 Medical_History_30 Medical_History_31
## 0 0 0
## Medical_History_32 Medical_History_33 Medical_History_34
## 19414 0 0
## Medical_History_35 Medical_History_36 Medical_History_37
## 0 0 0
## Medical_History_38 Medical_History_39 Medical_History_40
## 0 0 0
## Medical_History_41 Medical_Keyword_1 Medical_Keyword_2
## 0 0 0
## Medical_Keyword_3 Medical_Keyword_4 Medical_Keyword_5
## 0 0 0
## Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 0 0 0
## Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 0 0 0
## Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 0 0 0
## Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 0 0 0
## Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 0 0 0
## Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 0 0 0
## Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 0 0 0
## Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 0 0 0
## Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 0 0 0
## Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 0 0 0
## Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 0 0 0
## Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 0 0 0
## Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 0 0 0
## Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 0 0 0
## Medical_Keyword_48
## 0
# Drop columns whose proportion of missing values exceeds `threshold`.
#
# @param dat        A data frame.
# @param threshold  Maximum allowed NA proportion per column (e.g. 0.3).
# @return `dat` restricted to the columns at or below the threshold.
#
# Fix: the original used `dat[, -which(colMeans(is.na(dat)) > threshold)]`,
# which removes EVERY column when no column exceeds the threshold, because
# `which()` then returns integer(0) and `dat[, -integer(0)]` selects zero
# columns.  Logical indexing avoids that edge case; drop = FALSE keeps the
# result a data frame even if only one column survives.
rmNAvars <- function(dat, threshold) {
  keep <- colMeans(is.na(dat)) <= threshold
  dat[, keep, drop = FALSE]
}
# Remove columns with more than 30% missing values from train, then restrict
# test to the surviving columns (intersect, since test lacks Response).
train_clean<-rmNAvars(train,0.3)
test_clean<-test[,intersect(colnames(test), colnames(train_clean))]
Replacing/imputing missing values with the median, since the median is not sensitive to outliers. For the variables that are not dropped in the previous step, those with smaller percentages of missing values are imputed. The methodology used for imputation is the median of the remaining data series. This is a commonly used industry practice and is efficient because the missing data for all variables are randomly distributed over the response variable.
# Impute missing entries of every numeric column with that column's median.
# Non-numeric columns (e.g. factors) are left untouched.
#
# @param datafra A data frame.
# @return The same data frame with NAs in numeric columns replaced by the
#   column median (computed over the non-missing values).
manage_na <- function(datafra) {
  for (col_idx in seq_len(ncol(datafra))) {
    column <- datafra[, col_idx]
    if (is.numeric(column)) {
      # median(x, na.rm = TRUE) equals the median of the non-NA values.
      datafra[is.na(column), col_idx] <- median(column, na.rm = TRUE)
    }
  }
  datafra
}
# Median-impute the cleaned frames.
# Fix: the original called manage_na(test) on the RAW test set, silently
# discarding the high-NA column removal that built test_clean above.
# Imputing test_clean keeps the dropped columns dropped.
train_clean <- manage_na(train_clean)
test_clean <- manage_na(test_clean)
# Set aside the continuous predictors for the summary-statistics table below.
train_conti<-train_clean[,c("Product_Info_4", "Ins_Age", "Ht", "Wt", "BMI",
"Employment_Info_1", "Employment_Info_4", "Employment_Info_6")]
# Convert every non-numeric column (in this dataset only Product_Info_2, a
# factor) to its numeric level codes so the frames are fully numeric for the
# models that follow.
# Fix: the original applied as.numeric() to a data-frame slice, which only
# worked by accident because exactly one column was non-numeric (single-column
# `[` drops to a vector); with zero or several such columns it would error.
# lapply over the selected columns handles any number of them, and
# is.numeric() replaces the `class(x) == "numeric" | ... == "integer"` test.
non_numeric_train <- !vapply(train_clean, is.numeric, logical(1))
train_clean[non_numeric_train] <- lapply(train_clean[non_numeric_train], as.numeric)
non_numeric_test <- !vapply(test_clean, is.numeric, logical(1))
test_clean[non_numeric_test] <- lapply(test_clean[non_numeric_test], as.numeric)
Dividing the data into continuous, categorical and dummy variables.
# Counts of variables by family and by type (continuous/categorical/dummy).
# Fix: under R < 4.0 data.frame() defaulted to stringsAsFactors = TRUE, so
# Variable_Type was a factor and assigning the new label "Total" to row 12
# produced the "invalid factor level, NA generated" warning (and an NA cell).
# stringsAsFactors = FALSE keeps it character so the label sticks.
temp1 <- data.frame(Variable_Type = c(
  "Product Information",
  "Insurance Age",
  "Height",
  "Weight",
  "BMI",
  "Employment Information",
  "Insured Information",
  "Insurance History",
  "Family History",
  "Medical History",
  "Medical Keyword"),
  stringsAsFactors = FALSE)
temp1$Continous <- c(1, 1, 1, 1, 1, 3, 0, 1, 4, 0, 0)
temp1$Categorical <- c(6, 0, 0, 0, 0, 3, 7, 8, 1, 41, 0)
temp1$Dummy <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48)
temp1$Total <- rowSums(temp1[, -1])
# Grand-total row: colSums is evaluated before row 12 exists, so it sums the
# 11 family rows only.
temp1[12, 2:5] <- colSums(temp1[, -1])
temp1$Variable_Type[12] <- "Total"
# Render the variable-type table as an interactive DT widget; the JS callback
# styles the header black with white text.
datatable(temp1, options = list(pageLength = 13,
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
"}")))
Continuous variables are analyzed using summary statistics, box plots and density plots. The categorical variables are analyzed using event rate chart to track the variation to the response.
The response is an ordinal variable with levels from 1 to 8, corresponding to the risk level of a customer.
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Histogram of the ordinal response (levels 1-8), rendered as an interactive
# plotly widget. Width/height are passed to ggplotly() only: repeating them in
# layout() is deprecated and was the source of the knitr warning below.
p <- ggplot(train, aes(x = Response)) + geom_histogram(fill = "Red", alpha = 0.3)
ggplotly(p, color = ~Response, width = 800, height = 400) %>%
  layout(title = "Distribution of Response Variable", plot_bgcolor = "white",
         xaxis = list(gridcolor = "lightgrey", opacity = 0.5),
         yaxis = list(gridcolor = "lightgrey", opacity = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
While it is not stated whether the scale is in increasing order of riskiness or otherwise, from the distribution of the response variable we can infer that 8 likely refers to customers at high risk who are likely to take insurance, while 1 refers to customers at low risk.
To allow for easier convergence of machine learning algorithms variables are normalized to the range of [0, 1]. The most common normalizing function used is given below:
\[ X = \frac{x_i - x_{\min}}{x_{\max} - x_{\min}} \]
The same function had been applied to the continuous variables in the input data-set. The summary statistics help understand the distribution of the underlying dataset, the box plots and density plots enable visualizing the data-set
## Generating Summary Table
# Column-wise summary statistics for the continuous predictors. vapply()
# iterates over the data.frame's columns directly (a data.frame is a list of
# columns), avoiding the implicit matrix coercion of apply(df, 2, ...) and
# guaranteeing a numeric(1) per column.
col_stat <- function(f) {
  vapply(train_conti, function(x) as.numeric(f(x, na.rm = TRUE)), numeric(1))
}
summ_conti <- data.frame(Variables = colnames(train_conti))
summ_conti$Min <- col_stat(min)
summ_conti$Max <- col_stat(max)
summ_conti$Mean <- col_stat(mean)
summ_conti$Median <- col_stat(median)
datatable(summ_conti, options = list(initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});",
"}")))
The box plots enable visualization of the data-set, especially in relation to outliers, even considering the large number of data points.
library(ggplot2)
library(plotly)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
# Attach the response to the continuous predictors so box plots can be split
# by Response level.
train_cont <- data.frame(train_conti, Response=train_clean$Response)
# Draw one plot per column index in `ii` and arrange them on a grid.
#
# Args:
#   data.in: data frame handed through to `fun`.
#   fun:     plotting function with signature fun(data.in, i) returning a ggplot.
#   ii:      integer vector of column indices to plot.
#   ncol:    number of grid columns (default 3).
doPlots <- function(data.in, fun, ii, ncol=3) {
  # lapply replaces the original grow-a-list loop (no repeated copying).
  pp <- lapply(ii, function(i) fun(data.in = data.in, i = i))
  do.call("grid.arrange", c(pp, ncol = ncol))
}
# Box plot of column `i` of `data.in`, grouped by the Response factor and
# labeled with the column's name.
plotBox <- function(data.in, i) {
  plot_df <- data.frame(y = data.in[, i], Response = data.in$Response)
  ggplot(plot_df, aes(x = factor(Response), y = y)) +
    geom_boxplot() +
    ylab(colnames(data.in)[i]) +
    theme_light()
}
# Box plots of all eight continuous variables against Response, 3 per row.
doPlots(data.in=train_cont, fun=plotBox, ii=1:8,ncol=3)
The box plots enable visualization of the data-set, especially in relation to outliers as well as the Response variable. We can see that BMI and Employment_Info_6 show variation with respect to the Response variable, so we keep them and eliminate all other continuous variables.
# Drop the continuous predictors that showed no visible variation across
# Response in the box plots; BMI and Employment_Info_6 are retained.
train_clean<- subset(train_clean, select = -c(Product_Info_4, Ins_Age, Ht, Wt, Employment_Info_1, Employment_Info_4) )
test_clean<- subset(test_clean, select = -c(Product_Info_4, Ins_Age, Ht, Wt, Employment_Info_1, Employment_Info_4) )
The density plots help visualize the characteristics of the distribution including statistical metrics such as mean, standard deviation and kurtosis. It also enables us to visually identify if any relationship exists with the response variable. For example: The density plot of variable Employment_Info_6 is similar to the histogram of the response variable, this probably indicated that this variable could be a good predictor of the response variable
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:plotly':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
## The following object is masked from 'package:dplyr':
##
## rename
# Density plots of the continuous predictors, drawn in small groups of columns
# with comparable scales. A single helper replaces the four near-identical
# copies of the melt / ggplot / ggplotly pipeline the original contained.
density_plot <- function(cols) {
  # drop = FALSE keeps a single-column selection as a data.frame so melt()
  # labels the variable correctly (the original special-cased column 7 by
  # assigning the variable name manually).
  temp_melt <- melt(train_conti[, cols, drop = FALSE])
  p <- ggplot(temp_melt, aes(value, fill = variable)) +
    geom_density(alpha = 0.5) +
    ggtitle("Density Plots")
  # Width/height go to ggplotly() only; repeating them in layout() is
  # deprecated and produced the knitr warnings.
  ggplotly(p, height = 800, width = 1000) %>%
    layout(plot_bgcolor = "transparent", paper_bgcolor = "transparent")
}
density_plot(1:2)           # Product_Info_4, Ins_Age
density_plot(c(3, 4, 5))    # Ht, Wt, BMI
density_plot(c(6, 8))       # Employment_Info_1, Employment_Info_6
density_plot(7)             # Employment_Info_4
In an attempt to capture the conditional probability of the response given a specific bin of the categorical variable \[ P(y=1|ProdInfo_2= A_1)=\frac{P(y=1 \cap ProdInfo_2= A_1 )}{P(ProdInfo_2= A_1)} \] 1. Product Information
# Categorical predictors = every column of train_clean that is not one of the
# continuous EDA columns.
train_categ<-train_clean[,-which(colnames(train_clean) %in% colnames(train_conti))]
# Event rate charts for the Product_Info variables: for each level of each
# variable, plot the conditional frequency P(Response = r | level).
# table(var, Response) has one row per level; dividing that matrix by the
# per-level counts (a vector recycled down the columns) divides each row by
# its level's total, yielding the conditional frequencies.
i="Product_Info"
train_temp<-train_categ[,grep(i,colnames(train_categ))]
index<-1
plt<-htmltools::tagList()
for (i in colnames(train_temp)){
data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
xaxis = list(title = gsub("_"," ",i),showgrid = T))
plt[[index]] <- as_widget(p)
index <- index + 1
}
# tagList renders all accumulated widgets when the chunk result is printed.
plt
2. Employment Information
# Event rate charts for the Employment_Info variables: bar heights are
# P(Response = r | level), computed by dividing the level-by-Response
# contingency table by the per-level counts (recycled down the columns).
i="Employment_Info"
train_temp<-train_categ[,grep(i,colnames(train_categ))]
index<-1
plt<-htmltools::tagList()
for (i in colnames(train_temp)){
data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
xaxis = list(title = gsub("_"," ",i),showgrid = T))
plt[[index]] <- as_widget(p)
index <- index + 1
}
plt
# Event rate charts for the InsuredInfo variables (section 3), using the same
# conditional-frequency construction as the chunks above.
i="InsuredInfo"
train_temp<-train_categ[,grep(i,colnames(train_categ))]
index<-1
plt<-htmltools::tagList()
for (i in colnames(train_temp)){
data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
xaxis = list(title = gsub("_"," ",i),showgrid = T))
plt[[index]] <- as_widget(p)
index <- index + 1
}
plt
4.Insurance History
# Event rate charts for the Insurance_History variables, same construction as
# the preceding chunks.
i="Insurance_History"
train_temp<-train_categ[,grep(i,colnames(train_categ))]
index<-1
plt<-htmltools::tagList()
for (i in colnames(train_temp)){
data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
xaxis = list(title = gsub("_"," ",i),showgrid = T))
plt[[index]] <- as_widget(p)
index <- index + 1
}
plt
5.Medical History
# NOTE(review): par(mfrow = ...) affects base graphics only; it has no effect
# on the plotly widgets produced below -- likely a leftover from an earlier
# base-graphics version of this chunk.
par(mfrow=c(2,2))
# Event rate charts for the Medical_History variables, same construction as
# the preceding chunks.
i="Medical_History"
train_temp<-train_categ[,grep(i,colnames(train_categ))]
index<-1
plt<-htmltools::tagList()
for (i in colnames(train_temp)){
data_freq<-as.data.frame(table(train_temp[,i],train_clean$Response)/(as.data.frame(table(train_temp[,i]))[,2]))
p<-plot_ly(data_freq, x = ~Var1, y = ~Freq, color = ~Var2, type="bar")%>%
layout(title = paste0("Event Rate Chart- ",gsub("_"," ",i)),
xaxis = list(title = gsub("_"," ",i),showgrid = T))
plt[[index]] <- as_widget(p)
index <- index + 1
}
plt
# Creating a new column as a sum of all these column : MedKeywordSum
# Collapse the 0/1 Medical_Keyword dummies into a single count per applicant.
# NOTE(review): all selections below address columns by position; any upstream
# change to the column layout silently breaks these sums and drops -- prefer
# name-based selection (e.g. grep("Medical_Keyword", names(train_clean))).
train_clean$MedKeywordSum <- rowSums(train_clean[,c(64:112)])
test_clean$MedKeywordSum <- rowSums(test_clean[,c(73:121)])
# Dropping Medical_Keyword_1 to Medical_Keyword_48 from dataset
# NOTE(review): the train sum used 64:112 but only 68:112 is dropped here,
# while the test set uses the same 73:121 range for both -- confirm the
# training indices; columns 64:67 appear to survive the drop.
train_clean <- subset(train_clean, select = -c(68:112) )
test_clean <- subset(test_clean, select = -c(73:121) )
# Creating a new column as a sum of all these column : MedHistSum
# Collapse the Medical_History columns into a single sum per applicant.
train_clean$MedHistSum <- rowSums(train_clean[,c(28:68)])
test_clean$MedHistSum <- rowSums(test_clean[,c(33:73)])
# Dropping Medical_hist_1 to Medical_hist_48 from dataset
# NOTE(review): the sums use 28:68 / 33:73 but the drops remove 28:66 / 33:72
# -- confirm these off-by-two ranges are intentional.
train_clean <- subset(train_clean, select = -c(28:66) )
test_clean <- subset(test_clean, select = -c(33:72) )
#these were eliminated in training data due to alot of NA rows
# Align the test columns with columns already removed from the training data.
test_clean<-subset(test_clean,select = -c(29:32))
train_clean<-subset(train_clean,select = -28)
A predictive model is built to predict the response value using **Multinomial Logistic Regression**. The steps executed are listed below.
Preparing the "test" dataset to contain the same columns as the "train_clean" dataset for use in the predictive models.
Creating a multinomial logistic regression model to predict Response.
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2018e.
## 1.0/zoneinfo/America/New_York'
library(nnet)
# Multinomial logistic regression of the 8-level Response on all remaining
# predictors. NOTE(review): nnet::multinom defaults to maxit = 100 and the
# trace below shows "stopped after 100 iterations" -- the fit had not
# converged; consider raising maxit.
MultinomModel <- multinom(Response ~ ., data = train_clean)
## # weights: 248 (210 variable)
## initial value 123479.318186
## iter 10 value 109795.691165
## iter 20 value 106631.615491
## iter 30 value 103834.040782
## iter 40 value 100719.001427
## iter 50 value 98588.294990
## iter 60 value 98164.596527
## iter 70 value 97861.844228
## iter 80 value 97183.867580
## iter 90 value 95876.601964
## iter 100 value 94419.643160
## final value 94419.643160
## stopped after 100 iterations
# Per-class probability matrix (one column per Response level).
# NOTE(review): predict_Response is computed but never used downstream.
predict_Response <- predict (MultinomModel, test_clean , "probs")
# Hard class prediction appended to the test set for submission.
test_clean$Response <- predict (MultinomModel, test_clean)
# NOTE(review): positional selection c(1, 31) assumes Id is column 1 and the
# new Response is column 31 -- prefer test_clean[, c("Id", "Response")].
submission <- test_clean[, c(1,31)]
write.csv(submission, "~/Desktop/Programming for analytics/week 6/Prudential/submission.csv", row.names = F)
The Kappa score obtained from Kaggle for this model is 0.36174, so there is substantial scope for improvement. Overall, we can see that age, BMI, family history and product type are the main factors in assessing the risk of the insurance.